My project addresses how to find a linear model to forecast the future values of a monthly time series. I accomplished this task by determining potential ARIMA models through the autocorrelation and partial autocorrelation functions of the stationary data. I also analyzed the AICc values. After testing potential models for independence, I began the forecasting process. I plotted the original time series but removed the last five data points. I used the forecast() function on each model to predict five points ahead and plotted the prediction onto the time series. Through this method, I concluded that an ARIMA(9,3,12) model created the closest fit to the five missing points.
My data set contains the monthly number of adults ages 30-34 who were diagnosed with AIDS in the United States from 1993 to 2002. By forecasting the future number of cases, medical suppliers and healthcare providers can plan their resources and better care for patients in the later years. I retrieved my data from CDC WONDER, the CDC's database for public-use health data. A full description of the dataset can be found on their website. I used R to analyze the data.
library(MASS)
## Warning: package 'MASS' was built under R version 3.4.3
library(qpcR)
## Loading required package: minpack.lm
## Loading required package: rgl
## Warning: package 'rgl' was built under R version 3.4.3
## Loading required package: robustbase
## Loading required package: Matrix
library(forecast)
## Warning in as.POSIXlt.POSIXct(Sys.time()): unknown timezone 'zone/tz/2018c.
## 1.0/zoneinfo/America/Los_Angeles'
library(ggplot2)
#read data
AIDS <- read.csv("AIDS.csv", header= TRUE,sep=",")
# Monthly series starting January 1993.
AIDS_ts<-ts(AIDS$Cases, start=c(1993,1), freq=12)
#plot data
ts.plot(AIDS_ts, main="Adults ages 30-34 diagnosed with AIDS in the US, 1993-2002, monthly", ylab="Number of cases", xlab="time")
#boxcox
# Box-Cox profile likelihood against a simple time index to choose a
# variance-stabilizing transformation; the second call zooms the
# lambda grid in around zero.
bcTrans <- boxcox(AIDS_ts ~ as.numeric(1:length(AIDS_ts)))
bcTrans2 <- boxcox(AIDS_ts ~ as.numeric(1:length(AIDS_ts)), lambda=seq(-.5,.5, len=100))
#choose lambda=0, so log transform
y.log <- log(AIDS_ts)
#plot transformation
ts.plot(y.log,main = "Log")
# differencing to remove trend
# NOTE(review): diff(y.log, 3) differences ONCE at lag 3; it does NOT
# difference three times at lag 1 (that would be
# diff(y.log, differences = 3)). The plot title and the comment below
# say "3 times" -- confirm which operation was intended.
y.log.diff3 <- diff(y.log,3)
ts.plot(y.log.diff3,main = "Differenced Data 3 times",ylab=expression(paste(nabla,y)))
#differencing at lag1 3 times has lowest variance
#acf, pacf
# Side-by-side ACF/PACF of the differenced series to suggest ARMA orders.
op <- par(mfrow = c(1,2))
acf(y.log.diff3)
pacf(y.log.diff3)
par(op)
# Calculate AICc for every ARMA(p, q) fit to the differenced series,
# p in 0..10, q in 0..12. Cells that fail to converge stay NA instead
# of aborting the whole grid.
# NOTE(review): models here use d = 0 because y.log.diff3 is already
# differenced; the later fits use order = c(p, 3, q) on the same
# series, which differences again -- the AICc ranking and those fits
# do not describe the same models.
#suggested model: ARIMA(9,3,4)
aiccs <- matrix(NA, nrow = 11, ncol = 13)
dimnames(aiccs) <- list(p = 0:10, q = 0:12)
for (p in 0:10) {
  for (q in 0:12) {
    # Guard against ML optimizer failures for awkward (p, q) pairs.
    aiccs[p + 1, q + 1] <- tryCatch(
      AICc(arima(y.log.diff3, order = c(p, 0, q), method = "ML")),
      error = function(e) NA
    )
  }
}
aiccs
## q
## p 0 1 2 3 4 5
## 0 -45.26117 -44.13300 -49.20528 -98.17205 -97.02301 -98.24525
## 1 -44.51876 -58.12204 -86.42122 -97.74570 -100.45116 -98.17965
## 2 -48.81665 -61.17443 -90.04957 -99.02602 -96.90065 -96.19086
## 3 -74.87107 -72.72797 -70.52182 -96.81354 -94.47301 -94.63523
## 4 -72.72975 -70.50051 -91.92119 -94.77582 -94.71976 -98.80855
## 5 -70.50369 -68.22881 -81.04330 -92.93250 -94.70753 -96.76475
## 6 -69.07522 -66.83608 -90.92387 -90.97813 -88.53458 -95.27095
## 7 -66.93241 -64.57819 -83.60420 -88.56724 -86.08579 -93.86304
## 8 -64.72652 -71.19911 -96.18601 -86.77014 -98.28442 -93.05992
## 9 -105.95888 -103.57372 -103.83999 -107.78245 -115.12486 -112.49422
## 10 -103.62112 -102.93389 -109.03763 -108.45496 -113.21813 -109.76424
## q
## p 6 7 8 9 10 11
## 0 -95.97585 -95.00469 -94.60843 -95.77398 -96.86195 -97.37413
## 1 -93.66192 -92.78347 -93.48173 -93.56059 -95.32118 -99.41840
## 2 -94.75303 -102.07764 -96.78960 -103.25862 -97.39758 -97.96232
## 3 -102.05958 -99.65943 -97.20928 -101.13086 -98.84381 -100.59925
## 4 -94.64313 -97.44794 -95.02486 -98.75350 -97.59833 -98.54881
## 5 -94.27261 -93.23994 -93.98237 -92.44347 -96.78693 -96.56560
## 6 -97.71835 -92.29330 -97.58880 -93.67505 -95.32422 -95.15804
## 7 -92.79064 -90.81093 -99.44669 -90.84134 -96.50648 -97.65932
## 8 -92.08319 -92.18627 -103.41549 -99.30991 -92.62578 -92.86949
## 9 -111.35722 -110.30131 -107.63040 -105.69607 -104.03337 -102.92212
## 10 -109.56074 -107.50499 -104.68475 -102.94884 -100.11352 -99.96113
## q
## p 12
## 0 -103.50696
## 1 -101.14378
## 2 -98.49308
## 3 -101.15244
## 4 -95.22089
## 5 -101.34034
## 6 -97.79322
## 7 -94.85595
## 8 -99.02252
## 9 -100.87824
## 10 -98.19247
#coefficients of potential models
# NOTE(review): y.log.diff3 is already a differenced series, yet these
# calls use d = 3 (order = c(p, 3, q)), which differences the data
# three more times. The AICc grid above was computed with d = 0 on the
# same series, so its ranking does not apply to these fits -- confirm
# whether d should be 0 here.
fit1 = arima(y.log.diff3, order=c(9,3,4),method="ML") #lowest AICc
fit2 = arima(y.log.diff3, order=c(9,3,10),method="ML") #check just in case
fit3 = arima(y.log.diff3, order=c(9,3,12),method="ML") #suggested model by ACF/PACF
# Print the three candidate fits. NOTE(review): the NaN standard
# errors in fit2 and fit3 below indicate a singular Hessian -- a
# symptom of an over-parameterized or non-invertible model; treat
# those coefficient tables with caution.
fit1
##
## Call:
## arima(x = y.log.diff3, order = c(9, 3, 4), method = "ML")
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ar6 ar7
## -1.6026 -1.6294 -1.5331 -1.2930 -1.0476 -0.7993 -0.3455
## s.e. 0.0984 0.1880 0.2483 0.2838 0.2957 0.2859 0.2554
## ar8 ar9 ma1 ma2 ma3 ma4
## 0.1316 0.0648 -0.9587 -0.0733 -0.9587 1.0000
## s.e. 0.1971 0.1053 0.0517 0.0416 0.0860 0.0676
##
## sigma^2 estimated as 0.02125: log likelihood = 41.35, aic = -54.71
fit2
##
## Call:
## arima(x = y.log.diff3, order = c(9, 3, 10), method = "ML")
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ar6 ar7 ar8
## -0.7650 -0.3291 -0.3175 0.1795 0.1020 -0.1336 0.0769 0.1861
## s.e. 0.1954 0.1938 NaN 0.1553 0.1413 0.1641 0.1964 0.1495
## ar9 ma1 ma2 ma3 ma4 ma5 ma6 ma7
## -0.1146 -1.9193 0.8356 -0.8011 1.3540 -0.0620 -0.3721 0.4405
## s.e. 0.1131 0.1525 0.3674 0.3674 0.1781 0.4545 0.2293 0.3538
## ma8 ma9 ma10
## -0.8922 0.3514 0.0657
## s.e. 0.3590 NaN NaN
##
## sigma^2 estimated as 0.01843: log likelihood = 47.81, aic = -55.62
fit3
##
## Call:
## arima(x = y.log.diff3, order = c(9, 3, 12), method = "ML")
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ar6 ar7 ar8
## -1.3845 -1.2976 -1.6356 -1.2685 -1.0397 -0.7891 0.0031 0.3462
## s.e. NaN NaN NaN NaN NaN NaN NaN 0.4382
## ar9 ma1 ma2 ma3 ma4 ma5 ma6 ma7
## 0.0956 -1.4021 0.3699 -0.4667 0.3423 0.2777 -0.8754 0.9266
## s.e. 0.1640 NaN NaN 0.9555 0.3471 0.9686 0.8964 NaN
## ma8 ma9 ma10 ma11 ma12
## -0.0574 0.2760 0.0593 -0.7391 0.2888
## s.e. NaN 0.7273 0.3270 0.8854 0.4425
##
## sigma^2 estimated as 0.01563: log likelihood = 53.07, aic = -62.13
# Test for independence of residuals
# Ljung-Box and Box-Pierce portmanteau tests; large p-values give no
# evidence against white-noise residuals.
# NOTE(review): with the default lag = 1 these tests examine only the
# first residual autocorrelation; consider a larger lag (and
# fitdf = p + q) for a meaningful portmanteau test.
Box.test(residuals(fit1), type="Ljung")
##
## Box-Ljung test
##
## data: residuals(fit1)
## X-squared = 4.1158e-09, df = 1, p-value = 0.9999
Box.test(residuals(fit1), type="Box-Pierce")
##
## Box-Pierce test
##
## data: residuals(fit1)
## X-squared = 4.0093e-09, df = 1, p-value = 0.9999
#test normality of residuals
# Shapiro-Wilk: p = 0.34, so normality of the residuals is not rejected.
shapiro.test(residuals(fit1))
##
## Shapiro-Wilk normality test
##
## data: residuals(fit1)
## W = 0.98701, p-value = 0.3436
# Residual diagnostic plots for fit1.
ts.plot(residuals(fit1),main = "Fitted Residuals")
# One par() call sets both the 2x2 layout and the outer margin needed
# by the outer title, so `op` records (and par(op) restores) both.
# The original set mfrow = c(1, 2) first and immediately overrode it,
# which also left `oma` leaked after par(op).
op <- par(mfrow = c(2, 2), oma = c(0, 0, 2, 0))
# ACF of residuals, and of squared residuals (the latter reveals
# conditional heteroskedasticity that a linear ARMA cannot capture).
acf(residuals(fit1),main = "Autocorrelation")
acf(residuals(fit1)^2,main = "Autocorrelation")
pacf(residuals(fit1),main = "Partial Autocorrelation")
hist(residuals(fit1),main = "Histogram")
# Normal Q-Q plot with reference line.
qqnorm(residuals(fit1))
qqline(residuals(fit1),col ="blue")
# Add overall title in the outer margin.
title("Fitted Residuals Diagnostics", outer=TRUE)
par(op)
# Test for independence of residuals
# Same portmanteau tests for fit2; large p-values give no evidence
# against white-noise residuals (default lag = 1 -- see note on fit1).
Box.test(residuals(fit2), type="Ljung")
##
## Box-Ljung test
##
## data: residuals(fit2)
## X-squared = 0.081005, df = 1, p-value = 0.7759
Box.test(residuals(fit2), type="Box-Pierce")
##
## Box-Pierce test
##
## data: residuals(fit2)
## X-squared = 0.07891, df = 1, p-value = 0.7788
#test normality of residuals
# Shapiro-Wilk: p = 0.37, so normality of the residuals is not rejected.
shapiro.test(residuals(fit2))
##
## Shapiro-Wilk normality test
##
## data: residuals(fit2)
## W = 0.98743, p-value = 0.3714
# Residual diagnostic plots for fit2.
ts.plot(residuals(fit2),main = "Fitted Residuals")
# One par() call sets both the 2x2 layout and the outer margin needed
# by the outer title, so `op` records (and par(op) restores) both.
# The original set mfrow = c(1, 2) first and immediately overrode it,
# which also left `oma` leaked after par(op).
op <- par(mfrow = c(2, 2), oma = c(0, 0, 2, 0))
# ACF of residuals, and of squared residuals (the latter reveals
# conditional heteroskedasticity that a linear ARMA cannot capture).
acf(residuals(fit2),main = "Autocorrelation")
acf(residuals(fit2)^2,main = "Autocorrelation")
pacf(residuals(fit2),main = "Partial Autocorrelation")
hist(residuals(fit2),main = "Histogram")
# Normal Q-Q plot with reference line.
qqnorm(residuals(fit2))
qqline(residuals(fit2),col ="blue")
# Add overall title in the outer margin.
title("Fitted Residuals Diagnostics", outer=TRUE)
par(op)
# Test for independence of residuals
# Same portmanteau tests for fit3; large p-values give no evidence
# against white-noise residuals (default lag = 1 -- see note on fit1).
Box.test(residuals(fit3), type="Ljung")
##
## Box-Ljung test
##
## data: residuals(fit3)
## X-squared = 0.0059557, df = 1, p-value = 0.9385
Box.test(residuals(fit3), type="Box-Pierce")
##
## Box-Pierce test
##
## data: residuals(fit3)
## X-squared = 0.0058017, df = 1, p-value = 0.9393
#test normality of residuals
# Shapiro-Wilk: p = 0.87, so normality of the residuals is not rejected.
shapiro.test(residuals(fit3))
##
## Shapiro-Wilk normality test
##
## data: residuals(fit3)
## W = 0.99343, p-value = 0.869
# Residual diagnostic plots for fit3.
ts.plot(residuals(fit3),main = "Fitted Residuals")
# One par() call sets both the 2x2 layout and the outer margin needed
# by the outer title, so `op` records (and par(op) restores) both.
# The original set mfrow = c(1, 2) first and immediately overrode it,
# which also left `oma` leaked after par(op).
op <- par(mfrow = c(2, 2), oma = c(0, 0, 2, 0))
# ACF of residuals, and of squared residuals (the latter reveals
# conditional heteroskedasticity that a linear ARMA cannot capture).
acf(residuals(fit3),main = "Autocorrelation")
acf(residuals(fit3)^2,main = "Autocorrelation")
pacf(residuals(fit3),main = "Partial Autocorrelation")
hist(residuals(fit3),main = "Histogram")
# Normal Q-Q plot with reference line.
qqnorm(residuals(fit3))
qqline(residuals(fit3),col ="blue")
# Add overall title in the outer margin.
title("Fitted Residuals Diagnostics", outer=TRUE)
par(op)
#setwd("C:/Users/Vivian/Documents/PSTAT174_project")
#data forecasting
# Truncate the series at April 2002 so the held-out points can be
# compared against the forecasts below.
AIDS_ts_mod<-ts(AIDS$Cases, start=c(1993,1), freq=12, end=c(2002,4))
#install.packages("forecast")
#plot data with 5 less points
mod <- window(AIDS_ts_mod)
# fit modified data using our models
# NOTE(review): these fits are on the RAW counts (no log transform and
# no lag-3 difference) with d = 3, while model identification above
# was done on the log-transformed, lag-3-differenced series -- the
# (p, d, q) orders carried over here do not describe the same models;
# confirm intent.
fit1_AIDS = arima(mod, order=c(9,3,4),method="ML")
fit2_AIDS = arima(mod, order=c(9,3,10),method="ML")
fit3_AIDS = arima(mod, order=c(9,3,12),method="ML")
# use models to forecast 5 values ahead
fcast1_AIDS <- forecast(fit1_AIDS, h=5)
fcast2_AIDS <- forecast(fit2_AIDS, h=5)
fcast3_AIDS <- forecast(fit3_AIDS, h=5)
# Plot each forecast (point forecasts plus prediction intervals).
plot(fcast1_AIDS, xlab = "year", ylab="Number of cases")
plot(fcast2_AIDS, xlab = "year", ylab="Number of cases")
plot(fcast3_AIDS, xlab = "year", ylab="Number of cases", col="seagreen")
# Compare our model fits visually: overlay the in-sample fitted values
# on the truncated series.
ts.plot(AIDS_ts_mod, main="Fitted using ARIMA(9,3,4)", ylab="Number of cases", xlab="year")
lines(fitted(fcast1_AIDS), col="goldenrod")
ts.plot(AIDS_ts_mod, main="Fitted using ARIMA(9,3,10)", ylab="Number of cases", xlab="year")
lines(fitted(fcast2_AIDS), col="skyblue")
ts.plot(AIDS_ts_mod, main="Fitted using ARIMA(9,3,12)", ylab="Number of cases", xlab="year")
lines(fitted(fcast3_AIDS), col="seagreen") # this one seems to be the best
# Compare the forecasted values with the observations we removed.
# AIDS was already read at the top of the script and never modified,
# so the redundant read.csv() call has been dropped.
AIDS_ts_orig<-ts(AIDS$Cases, start=c(1993,1), freq=12) # original, untruncated data
plot(fcast1_AIDS, xlab = "year", ylab="Number of cases")
lines(AIDS_ts_orig)
plot(fcast2_AIDS, xlab = "year", ylab="Number of cases") # this one seems to do better overall
lines(AIDS_ts_orig)
plot(fcast3_AIDS, xlab = "year", ylab="Number of cases")
lines(AIDS_ts_orig)